In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from collections import defaultdict
from cyvcf2 import VCF

# Import colour palettes for later on
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource
from bokeh.palettes import Category20b
from bokeh.palettes import Purples
from bokeh.palettes import Greens
from bokeh.palettes import YlOrBr
from bokeh.palettes import YlOrRd
from bokeh.palettes import PuOr
from bokeh.palettes import RdGy
from bokeh.plotting import figure, show, output_file
output_notebook()
Loading BokehJS ...
In [2]:
# Reference: https://github.com/diazale/1KGP_dimred/blob/master/Genotype_dimred_demo.ipynb

vcf_name = '1000-genomes-snp/ALL.wgs.nhgri_coriell_affy_6.20140825.genotypes_has_ped.vcf.gz'
pop_desc_name = '1000-genomes-snp/20131219.populations.tsv'
pop_file_name = '1000-genomes-snp/affy_samples.20141118.panel'

# get samples
individuals = VCF(vcf_name).samples

# get pop by continent
name_by_code = {}
pop_by_continent = defaultdict(list)
with open(pop_desc_name ,'r') as f:
    lines = f.readlines()
for line in lines:
    split_line = line.split('\t')
    if split_line[0] in ['Population Description','Total','']:  # header or footer
        continue
    name_by_code[split_line[1]] = split_line[0]
    pop_by_continent[split_line[2]].append(split_line[1])
continents = list(pop_by_continent.keys())
pops=[]
for continent in continents:
    pops.extend(pop_by_continent[continent])

# get pop by individ and individs by pop
population_by_individual = defaultdict(int)
individuals_by_population = defaultdict(list)
with open(pop_file_name ,'r') as f:
    lines = f.readlines()
for line in lines:
    split_line = line.split()
    if split_line[0] == 'sample':  # header line
        continue
    sample_name = split_line[0]
    population_name = split_line[1]
    population_by_individual[sample_name] = population_name
    individuals_by_population[population_name].append(sample_name) 

# get indices of pop members by pop
indices_of_population_members = defaultdict(list)
for idx, individual in enumerate(individuals):
    try:
        indices_of_population_members[population_by_individual[individual]].append(idx)
    except KeyError: # We do not have population info for this individual
        continue
        
targets = []
for ind in individuals:
    pop = population_by_individual[ind]
    targets.append(pop + ' - ' + name_by_code[pop])
In [3]:
# Assign colours to each population, roughly themed according to continent
# The Category20b palette has a bunch of groups of 4 shades in the same colour range
color_dict = {}
for i, cont in enumerate(continents): 
    for j, pop in enumerate(pop_by_continent[cont]):
        color_dict[pop] = Category20b[20][4*i+j%4]

# Colour palette above only really supports groups of 4 so we have to manually specify a few colours for the 5th/6th
# members of a group

color_dict['CHS'] = Purples[9][4]# purple
color_dict['STU'] = Greens[9][6] # green
color_dict['LWK'] = PuOr[11][-1] # brown
color_dict['MSL'] = PuOr[11][-2] # rusty brown
color_dict['YRI'] = PuOr[11][-3] # cappucino w/ extra milk (stirred)
color_dict['CEU'] = RdGy[11][-3]
In [4]:
def visualize(file_name, name=None, targets=None, target_names=None):
    with open(file_name, 'rb') as f:
        pcs = np.load(f)
    if name is None:
        name = '.'.join(file_name.split('/')[-1].split('.')[:-1])
    
    TOOLTIPS = [
        ("index", "$index"),
        ("(x,y)", "($x, $y)"),
        ("label", "@label"),
    ]
    p = figure(plot_width=1350, plot_height=800, tooltips=TOOLTIPS)
    p.title.text = name
    if targets is None:
        for pop in pops:
            pop_indices = indices_of_population_members[pop]
            source = ColumnDataSource(
                data={'x': pcs[pop_indices, 0], 'y': pcs[pop_indices, 1], 'label': [name_by_code[pop]] * len(pop_indices)}
            )
    #         print(pop, name_by_code[pop])
            p.circle('x', 'y', legend_label=name_by_code[pop], color = color_dict[pop], source=source)
    else:
        targets = np.asarray(targets)
        n_clusters = int(max(targets) + 1)
        for i in range(1, n_clusters):
            pop_indices = (targets == i)
            pop_indices = np.where(pop_indices)[0]
            source = ColumnDataSource(
                data={'x': pcs[pop_indices, 0], 'y': pcs[pop_indices, 1], 'label': [' / '.join(target_names[i])] * len(pop_indices)}
            )
    #         print(pop, name_by_code[pop])
            p.circle('x', 'y', legend_label=' / '.join(target_names[i]), color = Category20b[20][i % 20], source=source)
    p.legend.label_text_font_size = '8pt'
#     p.legend.location = "top_left"
    p.legend.click_policy="hide"
#     p.legend.visible=False
    p.add_layout(p.legend[0], 'right')
    if targets is None:
        output_file('htmls/' + name + '.html')
    
    show(p)
In [5]:
# PCA
visualize('new_1kgenome_2pcs.npy')
In [6]:
# t-SNE
visualize('new_1kgenome_tsne_p30.npy')
In [7]:
# UMAP
visualize(f'new_1kgenome_umap_nn15_md0.5.npy')
In [8]:
# PCA -> 15pcs -> UMAP
visualize(f'new_1kgenome_15pcs-umap_nn15_md0.5.npy')
In [9]:
# PCA -> 15pcs -> UMAP with different random seeds -- shows how unstable UMAP is
for r in range(1, 6):
    visualize(f'new_1kgenome_15pcs-umap_randomstate{r}.npy')
In [10]:
# Vanilla Adversarial Autoencoder
for std in [0.5, 1, 2]:
    for lr in ['0.001', '0.0005', '0.0001']:
        name = f'new_1k_vanilla_lr{lr}_bs128_std{std}'
        visualize(f'../viz-autoencoder/{name}/res.npy', name=name)
In [11]:
# Adversarial Autoencoder with learnable cluster heads
for nc in [10, 20, 50]:
    for std in [0.5, 1, 2]:
        for lr in ['0.001', '0.0005', '0.0001']:
            name = f'new_1k_dim_{nc}_lr{lr}_bs32_std{std}'
            visualize(f'../viz-autoencoder/{name}/res.npy', name=name)
In [ ]:
# Adversarial Autoencoders with cluster heads as geographical locations of countries
for nc in [20]:
    for std in [0.5, 1, 2]:
        for lr in ['0.001', '0.0005', '0.0001']:
            name = f'new_1k_dim_{nc}_lr{lr}_bs32_std{std}_map'
            visualize(f'../viz-autoencoder/{name}/res.npy', name=name)
In [ ]: